/*Program: 01_ids.sas
  Author:  Kamila Sommer 
  Purpose: This program creates a database that contains all the identifiers for each record.  These
           identifiers include:
           * The teditseq variable is created by TransUnion and is unique to each year.
           * The matchkey variable is created by TransUnion and is common to 1997, 1999, and 2001.
           * The person variable is created by Bob and Glenn and is common to 2003 and 2004.
           We want to use the person variable as our identifier across time and datasets.

    Note: 147 observations in only 1997.
          388 obs in only 1999.
          416 obs in 1997 and 1999 that are not in 2001.*/



libname in '/data/';
libname out '/data/';
libname in_2003 '/data/';
libname in_07 '/data'; 
libname in_08 '/data'; 
libname in_10 '/data'; 

options linesize=100 mprint;

/* Macro that strips datasets down to only teditseq and matchkey.*/
%macro matchkey(yr);

       /* The score dataset contains matchkey and the teditseq for 97, 99, 01.*/
       data ids&yr;
       	    set in.score&yr (keep=matchkey teditseq rename=teditseq=teditseq&yr.);
       run;

       proc sort data=ids&yr;
       	    by matchkey;
       run;

       /* This step reads in the data from three text files (97, 99, 01) that match 'matchkey' to the 2003 value of 'teditseq'.*/
       /* These 97 and 99 files are nearly subsets of the 01 file, but some records only exist in one or both of the earlier years,
       	  so we read in all 3 files individually.*/

       x dos2unix "/data/June&yr._Match03.txt";

       data match&yr;
       	    infile "/ata/June&yr._Match03.txt" lrecl=25;
    	    input
		teditseq03 $ 1-14
         	matchkey $ 16-25;
       run;

       proc sort data=match&yr;
       	    by matchkey;
       run;

       /* check that teditseq03 and matchkey have no duplicates in the match dataset. --jz 1/14/13 */
       proc sql;
       select count(distinct teditseq03) as UniqueTESQ,
       	      count(distinct matchkey) as UniqueMatch,
	      count(*) as nobs
	      from match&yr;
       quit;

%mend matchkey;

%matchkey(97);
%matchkey(99);
%matchkey(01);

/* This step merges observations from 1997, 1999, and 2001 using the matchkey variable.
   Dataset now has four variables: matchkey, teditseq97, teditseq99, teditseq01.;*/
data mkids;
    merge ids97 ids99 ids01;
    by matchkey;
run;

/* check that teditseq97, teditseq99, teditseq01 have no duplicates. --jz 1/14/13 */
%macro checker(yr);
data temp;
     set mkids;
     if teditseq&yr ne "";
run;

proc sql;
     select count(distinct teditseq&yr.) as UniqueTESQ&yr.,
	    count(*) as nobs
	    from temp;
quit;
%mend checker;
%checker(97);%checker(99);%checker(01);


/* Merges 97, 99, and 01 matchfiles sent from TU by matchkey. Merging these datasets 
   together gives us a teditseq 2003 value for all the matchkeys we have from 97-01.*/    
data tu_matchfiles;
    merge match97 match99 match01;
    by matchkey;
run; 

/* Merges the dataset containing teditseq for 2003 with the principal dataset with teditseqs
   for 1997-2001. Dataset now has five variables: matchkey, teditseq97, teditseq99, teditseq01, teditseq03.
   Note, this is not the complete set of obs for teditseq03, just those that showed up in earlier year(s).*/
data matched;
    merge mkids tu_matchfiles;
    by matchkey;
run;

proc sort data=matched;
    by teditseq03;
run;

proc print data=matched(obs=10);
title 'this is matched';
run;

/* Creates dataset from 2003 and 2004 score files with only 'person' and 'teditseq' variables, in order
   to merge into 'matched' dataset above.*/
data match03;
    set in_2003.score03 (keep=teditseq person permid rename=teditseq=teditseq03);
run;

proc sort data=match03;
    by person;
run;

/* check that teditseq03, person, permid have no duplicates. --jz 1/14/13 */
proc sql;
     select count(distinct teditseq03) as UniqueTESQ03,
     	    count(distinct person) as UniquePerson,
	    count(distinct permid) as UniquePermID,
	    count(*) as nobs
	    from match03;
quit;

data match04;
    set in.score04 (keep=teditseq person rename=teditseq=teditseq04);
run;

proc sort data=match04;
     by person;
run;

/* check that teditseq04, person have no duplicates. --jz 1/14/13 */
proc sql;
     select count(distinct teditseq04) as UniqueTESQ04,
     	    count(distinct person) as UniquePerson,
	    count(*) as nobs
	    from match04;
quit;

/* Merges 2003 and 2004 score datafiles by person variable.  This dataset has
    three variabes: person, teditseq03 and teditseq04.*/
/* What about the 26 observations in match04 not in match03? --jz 1/14/13*/
/* Why aren't there any people in 2003 who they couldn't find in 2004? --jz 3/12/13*/
data tu_0304;
    merge match03 (in = a) match04 (in = b);
    by person;    
    if a;
run;

proc sort data=tu_0304;
    by teditseq03;
run;

proc print data=tu_0304 (obs=20);
title 'this is tu_0304';
run;

/** match 2007 and 2008 based on permid; */
proc sort data = in_07.tu2007_june out = temp2007 nodupkey; by permid;  run;
proc sort data = in_08.tu2008_dec out = temp2008 nodupkey; by permid; run; 

/* I tested whether the tuedt2003 variable corresponded in 2007 and 2008.  It did for all people present in both years, so we keep the variable from 2007 for now. */
data temp; 
  merge temp2007 (in = a keep = permid tuedt2003 permid2007)
        temp2008 (in = b keep = permid permid2008);
by permid;
if a then tu07 = 1; else tu07 = 0;
if b then tu08 = 1; else tu08 = 0;
run; 

/*
proc freq data = temp; 
     tables tu07*tu08 /missing ; 
run;  
*/

proc sort data = temp; by permid; run;
proc sort data = in_2003.score03 nodupkey out = score03; by permid; run; 

/* I tested whether the tuedt2003 variable in 2007/2008 corresponded to the teditseq variable from 2003.  It did not correspond, so we use the variable from 2003 as correct*/
/*there are 248 observations in 2007,2008 not present in 2003*/
data match0708_03;
  merge temp (in = a keep=permid:)
        score03 (in = b keep = permid teditseq rename=teditseq=teditseq03);
by permid;
if a then tu_07_08 = 1; else tu_07_08 = 0;
if b then tu_03 = 1; else tu_03 = 0;
run;

/*
proc freq data = match0708_03; 
table tu_07_08*tu_03 /missing; 
run; 
*/

data match0708_03(keep=permid teditseq03 permid2008);
     set match0708_03;
     if tu_03 eq 1;
run;

proc contents data=match0708_03;
run;

proc sort data=match0708_03;
     by permid2008;
run;

/*merge 2010 data with 2008 data using permid2008*/
/*I have tried to find a merge with a higher match rate, but have been unable to do so --jmz*/
proc sort data=in_10.rollups10;
     by permid2008;
run;

data match070810_03;
     merge match0708_03(in=in1) in_10.rollups10(in=in2 keep=PermID2008 PermID2010 tueditsequencenumber);
     by permid2008;
     rename tueditsequencenumber=teditseq10;
     if in1;
run;

/* check that permid2010 has no duplicates. --jz */
/* only 30,009 people from the 2010 data can be merged back to 2008 using permid2008 */
/* shira stolarsky performed this merge and sent us the dataset of only the observations that match our sample*/
/* we may need her to send us more data in order to try to get a better merge.  neither shira nor myself are convinced its possible to do better given the variables that we have.*/
proc sql;
     select count(distinct permid2010) as UniquePermid2010,
	    count(*) as nobs
	    from match070810_03;
quit;

proc contents data=match070810_03;
run;


/* Merges matched and old on teditseq03.  The dataset now has 7 variables:
   matchkey, person, teditseq97, teditseq99, teditseq01, teditseq03 and teditseq04.  
   Also, confirm that no agent is included in 97-01 that is not part of the 2003 base sample */

proc sort data=match070810_03;
     by teditseq03;
run;

data out.ids;
    merge matched (in = a) tu_0304 (in = b) match070810_03 (in = c);
    by teditseq03;
    incl_97_01 = 0; incl_03_04 = 0; incl_07_08_10 = 0;
    if a then incl_97_01 = 1;
    if b then incl_03_04 = 1;
    if c then incl_07_08_10 = 1;
run;

proc print data=out.ids (obs=20);
     title 'this is output dataset: 97-10';
run;

/* 3281 observations are in 2003 (maybe 2004), not earlier. --jz 1/14/13 */
proc freq data = out.ids; 
     table incl_97_01*incl_03_04 incl_03_04*incl_07_08_10 / missing; 
run; 

/* Keep only relevant variables */
/* our FINAL person id will be called 'pid'; all people have a pid in all periods*/
data out.ids;
  set out.ids (drop=incl_97_01 incl_03_04 incl_07_08_10);
  pid = teditseq03;
run; 

proc contents data = out.ids; 
run; 

/* Compare the resulting data set out.ids with Karen's original data set in.ids */ 
proc sort data = out.ids; 
     by person; 
run;

proc sort data = in.ids; 
     by person; 
run; 

proc compare base=out.ids compare=in.ids ; 
var person ;
with person; 
title 'compare by person';
run; 

proc compare base=out.ids compare=in.ids ; 
var teditseq03;
with teditseq03; 
title 'compare by teditseq03';
run; 

proc compare base=out.ids compare=in.ids ; 
var teditseq01;
with teditseq01;
title 'compare by teditseq01';
run; 

proc compare base=out.ids compare=in.ids ;
var teditseq99;
with teditseq99;
title 'compare by teditseq99';
run; 

proc compare base=out.ids compare=in.ids ;
var teditseq97;
with teditseq97;
title 'compare by teditseq97';
run; 

